from pyspark import SparkConf, SparkContext
import os

# Point PySpark at the local Python interpreter
os.environ['PYSPARK_PYTHON'] = 'D:/PYTHON/python3.10/python.exe'

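# Initialize Spark in local mode, using all available CPU cores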
conf = SparkConf().setMaster('local[*]').setAppName('my_test_spark')
sc = SparkContext(conf=conf)

# Read input: load the text file as an RDD of lines
rdd = sc.textFile('E:/PythonBasicKnowledge/12 pyspark使用/统计单词数量.txt')

# Process the data: count how often each word appears
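# Split each line on spaces and flatten into a single RDD of words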
rdd1 = rdd.flatMap(lambda x: x.split(' '))

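# Pair each word with an initial count of 1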
rdd2 = rdd1.map(lambda x: (x, 1))

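# Sum the counts for each distinct word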
rdd3 = rdd2.reduceByKey(lambda a, b: a + b)

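# Sort by count, descending; a single partition yields one globally ordered result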
res = rdd3.sortBy(lambda x: x[1], ascending=False, numPartitions=1)

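# collect() triggers the job and returns the (word, count) pairs to the driver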
print(res.collect())


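# Shut down the SparkContext and release its resources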
sc.stop()
